WebsiteAgent: Introduce per-node XPath evaluation in extraction.

The new extraction sub-hash key `value` holds an XPath expression that
is evaluated on each selected node to produce the result value.

The previous directives `"text": true` and `"attr": "src"` are now
written as `"value": "text()"` and `"value": "@src"`, respectively.

With this enhancement, it is now possible for this agent to perform some
basic text processing on its own by making use of XPath functions like
`normalize-space`, `substring-after` and `translate`.

Akinori MUSHA 10 years ago
parent
commit
4d623c5893

+ 13 - 14
app/models/agents/website_agent.rb

@@ -23,14 +23,16 @@ module Agents
23 23
 
24 24
       To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.
25 25
 
26
-      When parsing HTML or XML, these sub-hashes specify how to extract with either a `css` CSS selector or a `xpath` XPath expression and either `"text": true` or `attr` pointing to an attribute name to grab.  An example:
26
+      When parsing HTML or XML, these sub-hashes specify how each extraction should be done.  The Agent first selects a node set from the document for each extraction key by evaluating either a CSS selector in `css` or an XPath expression in `xpath`.  It then evaluates an XPath expression in `value` on each node in the node set, converting the result into a string.  Here's an example:
27 27
 
28 28
           "extract": {
29
-            "url": { "css": "#comic img", "attr": "src" },
30
-            "title": { "css": "#comic img", "attr": "title" },
31
-            "body_text": { "css": "div.main", "text": true }
29
+            "url": { "css": "#comic img", "value": "@src" },
30
+            "title": { "css": "#comic img", "value": "@title" },
31
+            "body_text": { "css": "div.main", "value": "text()" }
32 32
           }
33 33
 
34
+      "@_attr_" is the XPath expression that extracts the value of an attribute named _attr_ from a node, and "text()" extracts the enclosed text.  You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove commas from a formatted number.
35
+
34 36
       When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about.  For example:
35 37
 
36 38
           "extract": {
@@ -70,9 +72,9 @@ module Agents
70 72
           'type' => "html",
71 73
           'mode' => "on_change",
72 74
           'extract' => {
73
-            'url' => { 'css' => "#comic img", 'attr' => "src" },
74
-            'title' => { 'css' => "#comic img", 'attr' => "alt" },
75
-            'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
75
+            'url' => { 'css' => "#comic img", 'value' => "@src" },
76
+            'title' => { 'css' => "#comic img", 'value' => "@alt" },
77
+            'hovertext' => { 'css' => "#comic img", 'value' => "@title" }
76 78
           }
77 79
       }
78 80
     end
@@ -157,14 +159,11 @@ module Agents
157 159
                   return
158 160
                 end
159 161
                 result = nodes.map { |node|
160
-                  if extraction_details['attr']
161
-                    node.attr(extraction_details['attr'])
162
-                  elsif extraction_details['text']
163
-                    node.text()
164
-                  else
165
-                    error '"attr" or "text" is required on HTML or XML extraction patterns'
166
-                    return
162
+                  value, = node.xpath(extraction_details['value'])
163
+                  if value.is_a?(Float) && value.to_i == value
164
+                    value = value.to_i
167 165
                   end
166
+                  value.to_s
168 167
                 }
169 168
                 log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
170 169
               end

+ 22 - 0
db/migrate/20140723110551_adopt_xpath_in_website_agent.rb

@@ -0,0 +1,22 @@
1
+class AdoptXpathInWebsiteAgent < ActiveRecord::Migration
2
+  def up
3
+    Agent.where(type: 'Agents::WebsiteAgent').each do |agent|
4
+      next if agent.extraction_type == 'json'
5
+
6
+      agent.options_will_change!
7
+      agent.options['extract'].each { |name, extraction|
8
+        case
9
+        when extraction.delete('text')
10
+          extraction['value'] = 'text()'
11
+        when attr = extraction.delete('attr')
12
+          extraction['value'] = "@#{attr}"
13
+        end
14
+      }
15
+      agent.save!
16
+    end
17
+  end
18
+
19
+  def down
20
+    raise ActiveRecord::IrreversibleMigration, "Cannot revert this migration"
21
+  end
22
+end

+ 4 - 4
spec/fixtures/agents.yml

@@ -10,8 +10,8 @@ jane_website_agent:
10 10
                  :expected_update_period_in_days => 2,
11 11
                  :mode => :on_change,
12 12
                  :extract => {
13
-                     :title => {:css => "item title", :text => true},
14
-                     :url => {:css => "item link", :text => true}
13
+                     :title => {:css => "item title", :value => 'text()'},
14
+                     :url => {:css => "item link", :value => 'text()'}
15 15
                  }
16 16
                }.to_json.inspect %>
17 17
 
@@ -27,8 +27,8 @@ bob_website_agent:
27 27
                  :expected_update_period_in_days => 2,
28 28
                  :mode => :on_change,
29 29
                  :extract => {
30
-                   :url => {:css => "#comic img", :attr => "src"},
31
-                   :title => {:css => "#comic img", :attr => "title"}
30
+                   :url => {:css => "#comic img", :value => "@src"},
31
+                   :title => {:css => "#comic img", :value => "@title"}
32 32
                  }
33 33
                }.to_json.inspect %>
34 34
 

+ 2 - 2
spec/models/agent_spec.rb

@@ -768,8 +768,8 @@ describe AgentDrop do
768 768
         url: 'http://dilbert.com/',
769 769
         mode: 'on_change',
770 770
         extract: {
771
-          url: { css: '[id^=strip_enlarged_] img', attr: 'src' },
772
-          title: { css: '.STR_DateStrip', text: true },
771
+          url: { css: '[id^=strip_enlarged_] img', value: '@src' },
772
+          title: { css: '.STR_DateStrip', value: 'text()' },
773 773
         },
774 774
       },
775 775
       schedule: 'every_12h',

+ 9 - 9
spec/models/agents/website_agent_spec.rb

@@ -11,9 +11,9 @@ describe Agents::WebsiteAgent do
11 11
         'url' => "http://xkcd.com",
12 12
         'mode' => 'on_change',
13 13
         'extract' => {
14
-          'url' => { 'css' => "#comic img", 'attr' => "src" },
15
-          'title' => { 'css' => "#comic img", 'attr' => "alt" },
16
-          'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
14
+          'url' => { 'css' => "#comic img", 'value' => "@src" },
15
+          'title' => { 'css' => "#comic img", 'value' => "@alt" },
16
+          'hovertext' => { 'css' => "#comic img", 'value' => "@title" }
17 17
         }
18 18
       }
19 19
       @checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @valid_options, :keep_events_for => 2)
@@ -256,8 +256,8 @@ describe Agents::WebsiteAgent do
256 256
           'url' => "http://xkcd.com",
257 257
           'mode' => "on_change",
258 258
           'extract' => {
259
-            'url' => {'css' => "#topLeft a", 'attr' => "href"},
260
-            'title' => {'css' => "#topLeft a", 'text' => "true"}
259
+            'url' => {'css' => "#topLeft a", 'value' => "@href"},
260
+            'title' => {'css' => "#topLeft a", 'value' => "text()"}
261 261
           }
262 262
         }
263 263
         rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site)
@@ -389,9 +389,9 @@ describe Agents::WebsiteAgent do
389 389
         'url' => "http://www.example.com",
390 390
         'mode' => 'on_change',
391 391
         'extract' => {
392
-          'url' => { 'css' => "#comic img", 'attr' => "src" },
393
-          'title' => { 'css' => "#comic img", 'attr' => "alt" },
394
-          'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
392
+          'url' => { 'css' => "#comic img", 'value' => "@src" },
393
+          'title' => { 'css' => "#comic img", 'value' => "@alt" },
394
+          'hovertext' => { 'css' => "#comic img", 'value' => "@title" }
395 395
         },
396 396
         'basic_auth' => "user:pass"
397 397
       }
@@ -421,7 +421,7 @@ describe Agents::WebsiteAgent do
421 421
         'mode' => 'on_change',
422 422
         'headers' => { 'foo' => 'bar' },
423 423
         'extract' => {
424
-          'url' => { 'css' => "#comic img", 'attr' => "src" },
424
+          'url' => { 'css' => "#comic img", 'value' => "@src" },
425 425
         }
426 426
       }
427 427
       @checker = Agents::WebsiteAgent.new(:name => "ua", :options => @valid_options)